# Using a random forest to find LOWCARBON and CRASHED locations in the JULES ensemble.
# Useful tutorial for random forests in R:
# https://www.r-bloggers.com/2021/04/random-forest-in-r/
set.seed(42)  # reproducibility for the forest fits and the runif() sample below
library(RColorBrewer)
library(e1071)
library(MASS)
library(caret)        # confusionMatrix()
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest) # randomForest(), tuneRF(), varImpPlot(), partialPlot(), MDSplot()
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the wave-0 ensemble summary. Provides the design matrix X (one row per
# ensemble member, one column per input parameter) and the outcome labels
# Y_char. NOTE(review): later legends also use 'lhs', which is not defined
# anywhere in this script -- presumably it comes from this RData file; confirm.
load('data/wave0_summary.RData')
d <- ncol(X)  # number of input parameters
# Row indices of the two failure classes of interest
lowcarbon_ix <- which(Y_char =='LOWCARBON')
crashed_ix <- which(Y_char == 'CRASHED')
x1 <- X[lowcarbon_ix , ]  # LOWCARBON members
x2 <- X[crashed_ix ,]     # CRASHED members
# Stack the two groups so the pairs() panels below can split them by row number
XY <- rbind(x1, x2)
# Pairs plot of the two failure classes: lower triangle shows LOWCARBON (blue),
# upper triangle shows CRASHED (red). The split works because XY was built by
# stacking the LOWCARBON rows (first nrow(x1)) on top of the CRASHED rows.
pairs(XY,
      lower.panel = function(x, y, ...) {
        low_ix <- seq_len(nrow(x1))
        points(x[low_ix], y[low_ix], col = 'blue', pch = 19, cex = 0.8)
      },
      upper.panel = function(x, y, ...) {
        crash_ix <- (nrow(x1) + 1):length(x)
        points(x[crash_ix], y[crash_ix], col = 'red', pch = 19, cex = 0.8)
      },
      gap = 0,
      xlim = c(0, 1), ylim = c(0, 1),
      labels = 1:d,
      oma = c(2, 18, 2, 2)) # move the default tick labels off the plot
reset <- function() {
  # Overlay helper: opens a fresh, empty full-device plot with a 0..1
  # coordinate system on both axes (new = TRUE keeps the previous plot
  # visible), so legends can be annotated anywhere on the figure.
  par(mfrow = c(1, 1), oma = rep(0, 4), mar = rep(0, 4), new = TRUE)
  plot(0:1, 0:1, type = "n", xlab = "", ylab = "", axes = FALSE)
}
reset()
# NOTE(review): 'lhs' is not defined in this script -- presumably loaded from
# wave0_summary.RData with the same column names as X; confirm, or use
# colnames(X) here instead.
legend('left', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topleft', pch = 19, col = c('red', 'blue'), legend = c('CRASHED', 'LOWCARBON'), bty = 'n', inset = 0.02, cex = 1.1 )
# Modelling data frame: all inputs plus the outcome as a factor response 'y'
# (so randomForest performs classification, not regression).
data_df <- data.frame(X, y = as.factor(Y_char))
# Simple fixed train/test split of the ensemble rows
ix_train <- 1:399
ix_test <- 400:499
train <- data_df[ix_train, ]
test <- data_df[ix_test, ]
# proximity = TRUE is required for the MDSplot() call further down
rf <- randomForest(y~., data=train, proximity=TRUE)
print(rf)
##
## Call:
## randomForest(formula = y ~ ., data = train, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 7.52%
## Confusion matrix:
## CRASHED LOWCARBON RAN class.error
## CRASHED 0 6 12 1.00000000
## LOWCARBON 0 50 9 0.15254237
## RAN 0 3 319 0.00931677
# In-sample predictions: expected to be (near-)perfect since the forest saw
# these rows during training -- not a measure of skill, just a sanity check.
p1 <- predict(rf, train)
confusionMatrix(p1, train$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CRASHED LOWCARBON RAN
## CRASHED 18 0 0
## LOWCARBON 0 59 0
## RAN 0 0 322
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9908, 1)
## No Information Rate : 0.807
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity 1.00000 1.0000 1.000
## Specificity 1.00000 1.0000 1.000
## Pos Pred Value 1.00000 1.0000 1.000
## Neg Pred Value 1.00000 1.0000 1.000
## Prevalence 0.04511 0.1479 0.807
## Detection Rate 0.04511 0.1479 0.807
## Detection Prevalence 0.04511 0.1479 0.807
## Balanced Accuracy 1.00000 1.0000 1.000
# Held-out predictions: the honest estimate of classification skill.
# Note CRASHED is never predicted (sensitivity 0) -- it is rare in the data.
p2 <- predict(rf, test)
confusionMatrix(p2, test$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CRASHED LOWCARBON RAN
## CRASHED 0 0 0
## LOWCARBON 4 18 3
## RAN 2 4 69
##
## Overall Statistics
##
## Accuracy : 0.87
## 95% CI : (0.788, 0.9289)
## No Information Rate : 0.72
## P-Value [Acc > NIR] : 0.0002808
##
## Kappa : 0.679
##
## Mcnemar's Test P-Value : 0.1048630
##
## Statistics by Class:
##
## Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity 0.00 0.8182 0.9583
## Specificity 1.00 0.9103 0.7857
## Pos Pred Value NaN 0.7200 0.9200
## Neg Pred Value 0.94 0.9467 0.8800
## Prevalence 0.06 0.2200 0.7200
## Detection Rate 0.00 0.1800 0.6900
## Detection Prevalence 0.00 0.2500 0.7500
## Balanced Accuracy 0.50 0.8642 0.8720
# OOB error vs number of trees, per class and overall
plot(rf)
# Tune mtry (number of variables tried at each split).
# BUG FIX: the response 'y' is the LAST column of train (d input columns plus
# y -- see the 32-row importance table below), not column 5; indexing with
# [, -5] / [, 5] dropped one input and passed another input as the response.
# Select by name so the split is robust to the number of inputs.
# Also renamed 't' -> 'tune_res' so base::t() is not masked.
tune_res <- tuneRF(train[, setdiff(names(train), "y")], train$y,
                   stepFactor = 0.5,
                   plot = TRUE,
                   ntreeTry = 150,
                   trace = TRUE,
                   improve = 0.05)
## mtry = 10 OOB error = 0.08717852
## Searching left ...
## mtry = 20 OOB error = 0.08900195
## -0.020916 0.05
## Searching right ...
## mtry = 5 OOB error = 0.08722938
## -0.0005833487 0.05
# Distribution of tree sizes (number of terminal nodes) across the forest
hist(treesize(rf),
main = "No. of Nodes for the Trees",
col = "grey")
#Variable Importance
varImpPlot(rf,
sort = T,
n.var = 10,
main = "Top 10 - Variable Importance")
# Full importance table; f0_io and b_wl_io dominate by MeanDecreaseGini
importance(rf)
## MeanDecreaseGini
## alpha_io 2.682637
## a_wl_io 2.763502
## bio_hum_cn 3.341003
## b_wl_io 21.133500
## dcatch_dlai_io 2.165257
## dqcrit_io 3.323660
## dz0v_dh_io 2.146042
## f0_io 34.293527
## fd_io 2.067817
## g_area_io 1.862012
## g_root_io 1.768203
## g_wood_io 2.068497
## gs_nvg_io 1.676781
## hw_sw_io 3.065932
## kaps_roth 2.699277
## knl_io 2.627970
## lai_max_io 2.697069
## lai_min_io 2.862380
## lma_io 1.982686
## n_inorg_turnover 1.523937
## nmass_io 2.651071
## nr_io 2.250479
## retran_l_io 2.732983
## retran_r_io 2.342289
## r_grow_io 2.141003
## rootd_ft_io 4.591724
## sigl_io 2.353085
## sorp 1.812005
## tleaf_of_io 3.416385
## tlow_io 2.150968
## tupp_io 2.263766
## l_vg_soil 2.136931
#MeanDecreaseGini
# Partial dependence on the two most important inputs (by MeanDecreaseGini)
partialPlot(rf, train, f0_io)
partialPlot(rf, train, b_wl_io)
# Multidimensional scaling of the forest's proximity matrix, coloured by class
MDSplot(rf, train$y)
# Uniform random sample of the unit input space, classified by the forest.
# 16000 = 500 rows x 32 columns -- assumes d == 32; TODO confirm if X changes.
samp <- runif(16000)
samp_mat <- matrix(data = samp, nrow = 500)
colnames(samp_mat) <- colnames(X)
samp_pred <- predict(rf, samp_mat)
# Class colours indexed by the predicted factor: CRASHED, LOWCARBON, RAN
pal = c('red', 'blue','grey')
# Columns 4 and 8 are b_wl_io and f0_io -- presumably chosen as the two
# dominant inputs from the importance table above; confirm indices against X
plot(samp_mat[,4], samp_mat[,8], col = pal[samp_pred], xlab = colnames(X)[4], ylab = colnames(X)[8])
# Pairs plot of only those sampled points the forest classifies as LOWCARBON:
# an approximate picture of the low-carbon region's density in input space
pairs(
samp_mat[samp_pred=='LOWCARBON', ],
gap = 0,
col = 'blue',
upper.panel = NULL,
pch = 20,
labels = 1:d
)
reset()
# NOTE(review): 'lhs' is not defined in this script -- presumably loaded from
# wave0_summary.RData; confirm, or use colnames(X) instead.
legend('right', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topright', pch = 19, col = c('red', 'blue'), legend = c('failed', 'zero carbon cycle'), bty = 'n', inset = 0.02, cex = 1.1 )
## Density of lowcarbon
# This function does a simple bootstrap test of the randomForest algorithm
# for a set number of ensemble members (ntrain). It splits the data into a training
# set of ntrain rows and a test set of ntest rows, trains the model, predicts the
# test set, and records the misclassification rate for each rep in the output.
boot_rf <- function(data, ntrain, ntest, nreps){
  # Bootstrap test of randomForest classification skill for a given training
  # size. Each rep: draw ntrain + ntest distinct rows from 'data', train on
  # the first ntrain, predict the remaining ntest, and record the
  # misclassification rate (1 - accuracy).
  #
  # data   data frame of inputs plus a factor response column named 'y'
  # ntrain number of training rows per rep
  # ntest  number of held-out test rows per rep
  # nreps  number of bootstrap repetitions
  #
  # Returns a numeric vector of length nreps of misclassification rates.
  outvec <- rep(NA_real_, nreps)
  for(i in seq_len(nreps)){
    n_train_and_test <- ntrain + ntest
    # FIX: sample rows of 'data' -- the original sampled 1:nrow(X), silently
    # depending on the global X and breaking for any other data frame.
    all_samp_ix <- sample(seq_len(nrow(data)), n_train_and_test)
    train_ix <- all_samp_ix[seq_len(ntrain)]
    test_ix <- all_samp_ix[(ntrain + 1):n_train_and_test]
    train <- data[train_ix, ]
    test <- data[test_ix, ]
    rf <- randomForest(y ~ ., data = train, proximity = TRUE)
    pred_test <- predict(rf, test)
    conf_test <- confusionMatrix(pred_test, test$y)
    outvec[i] <- 1 - conf_test$overall['Accuracy']
  }
  outvec
}
# Quick sanity check of boot_rf with a small configuration
test_boot <- boot_rf(data = data_df, ntrain = 100, ntest = 10, nreps = 10)
# Sweep the training-set size: nreps misclassification rates per size
nreps <- 50
nens_vec <- seq(from = 100, to = 400, by = 20)
outmat <- matrix(nrow = nreps, ncol = length(nens_vec))
for(j in seq_along(nens_vec)){  # seq_along, not 1:length() (safe if empty)
  outmat[, j] <- boot_rf(data = data_df, ntrain = nens_vec[j], ntest = 50, nreps = nreps)
}
# Mean misclassification rate (1 - accuracy) as a function of training size
oma_mean <- colMeans(outmat)
plot(nens_vec, oma_mean, type = 'b' )
# Spread of the misclassification rate at each training size
boxplot(outmat)